Data Preparation
Read In
input_dir <- fs::path("../input")
files <- fs::dir_ls(input_dir, glob = "*.csv")
his_dt <- read_csv(
files[1],
col_types = cols(
package = col_character(),
version = col_character(),
date = col_date(format = "%Y-%m-%d"),
repository = col_character()
)
)
ov_dt <- read_csv(
files[2],
col_types = cols(
package = col_character(),
version = col_character(),
depends = col_character(),
imports = col_character(),
license = col_character(),
needs_compilation = col_logical(),
author = col_character(),
bug_reports = col_character(),
url = col_character(),
date_published = col_date(format = "%Y-%m-%d"),
description = col_character(),
title = col_character()
)
)
Quick View
dplyr::glimpse(ov_dt, 100)
Rows: 18,388
Columns: 12
$ package <chr> "A3", "AATtools", "ABACUS", "abbreviate", "abbyyR", "abc", "abc.data", "…
$ version <chr> "1.0.0", "0.0.1", "1.0.0", "0.1", "0.5.5", "2.2.1", "1.0", "0.9.0", "1.0…
$ depends <chr> "R (>= 2.15.0), xtable, pbapply", "R (>= 3.6.0)", "R (>= 3.1.0)", NA, "R…
$ imports <chr> NA, "magrittr, dplyr, doParallel, foreach", "ggplot2 (>= 3.1.0), shiny (…
$ license <chr> "GPL (>= 2)", "GPL-3", "GPL-3", "GPL-3", "MIT + file LICENSE", "GPL (>= …
$ needs_compilation <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRU…
$ author <chr> "Scott Fortmann-Roe", "Sercan Kahveci [aut, cre]", "Mintu Nath [aut, cre…
$ bug_reports <chr> NA, "https://github.com/Spiritspeak/AATtools/issues", NA, NA, "http://gi…
$ url <chr> NA, NA, "https://shiny.abdn.ac.uk/Stats/apps/", "https://github.com/sigb…
$ date_published <date> 2015-08-16, 2020-06-14, 2019-09-20, 2021-12-14, 2019-06-25, 2022-05-19,…
$ description <chr> "Supplies tools for tabulating and analyzing the results of predictive m…
$ title <chr> "Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModel…
Questions
- How long did packages take from their first release to version
1.0?
- What type of packages were most frequent in different years?
- Who are the most productive authors?
- Can you predict the growth toward 2025?
- What license is most used? Has there been a change over time?
- How many packages use all CAPS, all small, or a mixture?
- How have the dependencies & imports changed over time?
- Which repositories do packages use? Github/Bitbucket etc. How do
these vary over time?
- Do packages have URLs for bug reports?
- Is there any temporal patterns to when versions are submitted to
CRAN?
- Have titles & descriptions gotten longer over time?
- Do authors use minor versions?
Features
Separate version
ov_dt <- ov_dt |>
separate(
version,
into =
c("major", "minor", "patch"),
sep = "\\.",
extra = "merge",
fill = "right",
remove = FALSE
)
Number of dependencies
ov_dt <- ov_dt |>
mutate(
num_dep = purrr::map_int(
.x = depends,
.f = function(x){
x |>
stringr::str_split(",", simplify = TRUE) |>
length()
}
),
num_dep = ifelse(is.na(depends), 0, num_dep)
)
Number of imports
ov_dt <- ov_dt |>
mutate(
num_imports = purrr::map_int(
.x = imports,
.f = function(x){
x |>
stringr::str_split(",", simplify = TRUE) |>
length()
}
),
num_imports = ifelse(is.na(imports), 0, num_imports)
)
Number of authors
ov_dt <- ov_dt |>
mutate(
num_authors = purrr::map_int(
.x = author,
.f = function(x){
x |>
stringr::str_split(",", simplify = TRUE) |>
length()
}
)
)
Temporal features
ov_dt <- ov_dt |>
mutate(
year = lubridate::year(date_published),
month = lubridate::month(date_published, label = TRUE),
day = lubridate::day(date_published),
wday = lubridate::wday(date_published, label = TRUE),
yr_mon = sprintf("%d-%s", year, month),
dt = lubridate::ym(paste0(year, "-", month))
)
Warning: 1 failed to parse.
Title & Description Lengths
ov_dt <- ov_dt |>
mutate(
len_title = purrr::map_int(title, ~ stringr::str_count(.x, "\\w+")),
len_desc = purrr::map_int(description, ~ stringr::str_count(.x, "\\w+"))
)
License
ov_dt <- ov_dt |>
mutate(
license_cleaned = case_when(
stringr::str_detect(license, "^GPL-3") ~ "GPL-3",
stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*3") ~ "GPL-3",
stringr::str_detect(license, "^GPL-2") ~ "GPL-2",
stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*2") ~ "GPL-2",
stringr::str_detect(license, "^AGPL") ~ "AGPL",
stringr::str_detect(license, "^LGPL") ~ "LGPL",
stringr::str_detect(license, "Apache") ~ "Apache",
stringr::str_detect(license, "BSD") ~ "BSD",
stringr::str_detect(license, "LGPL") ~ "LGPL",
# stringr::str_detect(license, "GNU") ~ "GNU",
stringr::str_detect(license, "MIT") ~ "MIT",
stringr::str_detect(license, "CC0") ~ "CC0",
# stringr::str_detect(license, "MPL") ~ "MPL",
# stringr::str_detect(license, "Unlimited") ~ "Unlimited",
# stringr::str_detect(license, "^CC") ~ "CC",
license == "GPL" ~ "GPL",
TRUE ~ "Other"
)
)
Bug Report Domain
Temporal Questions
How long did packages take from their first release to version
1.0?
What type of packages were most frequent in different
years?
Who are the most productive authors?
Can you predict the growth toward 2025?
What license is most used? Has there been a change over time? -
done
How many packages use all CAPS, all small, or a mixture?
How have the dependencies & imports changed over
time?
Which repositories do packages use? Github/Bitbucket etc. How do
these vary over time?
Do packages have URLs for bug reports?
Is there any temporal patterns to when versions are submitted to
CRAN?
Do authors use minor versions?
Have titles & descriptions gotten longer over time? -
done
How have the dependencies & imports changed over
time?
ov_dt |>
group_by(dt) |>
summarise_at(vars(num_dep, num_imports), list(mean = mean)) |>
ggplot(aes(x= dt)) +
geom_jitter(aes(y = num_dep_mean, color = "num_dep_mean"), alpha = 0.2) +
geom_smooth(aes(y = num_dep_mean, color = "num_dep_mean"), span = 0.3, se = FALSE) +
geom_jitter(aes(y = num_imports_mean, color = "num_imports_mean"), alpha = 0.2) +
geom_smooth(aes(y = num_imports_mean, color = "num_imports_mean"), span = 0.3, se = FALSE) +
theme_light()

- Have titles & descriptions gotten longer over time?
ov_dt |>
group_by(dt) |>
summarise_at(vars(len_title, len_desc), list(median = median, sd = sd), na.rm = TRUE) |>
ggplot(aes(x= dt)) +
geom_jitter(aes(y = len_title_median, color = "len_title_median"), alpha = 0.2) +
geom_smooth(aes(y = len_title_median, color = "len_title_median"), span = 0.3, se = FALSE) +
geom_jitter(aes(y = len_desc_median, color = "len_desc_median"), alpha = 0.2) +
geom_smooth(aes(y = len_desc_median, color = "len_desc_median"), span = 0.3, se = FALSE) +
theme_light()


ov_dt |>
ggplot(aes(x= date_published, y = len_title)) +
geom_jitter(alpha = 0.05) +
geom_smooth(span = 0.1, se = FALSE) +
theme_light() +
scale_y_log10()

ov_dt |>
ggplot(aes(x= date_published, y = len_desc)) +
geom_jitter(alpha = 0.05) +
geom_smooth(span = 0.2, se = FALSE) +
theme_light() +
scale_y_log10()

- What license is most used? Has there been a change over time?
ov_dt |>
group_by(license_cleaned) |>
count() |>
ggplot(aes(x = forcats::fct_reorder(license_cleaned, n), y = n, fill = license_cleaned)) +
geom_col() +
coord_flip() +
theme_minimal() +
guides(fill = FALSE) +
labs(x = "", y = "")
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.

ov_dt |>
group_by(dt) |>
count(license_cleaned) |>
mutate(license_cleaned = forcats::fct_reorder(license_cleaned, n)) |>
ggplot(aes(x= dt, y = n, color = license_cleaned)) +
# geom_line( alpha = 0.3) +
geom_jitter(alpha = 0.3) +
geom_smooth(span = 0.3, se = FALSE) +
theme_light()

ov_dt |>
group_by(dt) |>
count(license_cleaned) |>
mutate(license_cleaned = forcats::fct_reorder(license_cleaned, n)) |>
ggplot(aes(x= dt, y = n, color = license_cleaned)) +
# geom_line( alpha = 0.3) +
geom_jitter(alpha = 0.3) +
geom_smooth(span = 0.8, se = FALSE) +
theme_light() +
scale_y_log10()

- Do packages have URLs for bug reports?
ov_dt |>
group_by(dt) |>
count(url_exist = is.na(url)) |>
ggplot(aes(x= dt, y = n, color = url_exist)) +
geom_jitter(alpha = 0.3) +
geom_smooth(span = 0.3, se = FALSE) +
theme_light()

ov_dt |>
group_by(dt) |>
count(url_exist = is.na(bug_reports)) |>
ggplot(aes(x= dt, y = n, color = url_exist)) +
geom_jitter(alpha = 0.3) +
geom_smooth(span = 0.3, se = FALSE) +
theme_light()

- Which repositories do packages use? Github/Bitbucket etc. How do
these vary over time?

- Is there any temporal patterns to when versions are submitted to
CRAN?
ov_dt |>
group_by(dt) |>
count() |>
ggplot(aes(dt, n)) +
geom_line()
ov_dt |> filter(!is.na(dt)) |> count(dt) |> arrange(dt) |> timetk::pad_by_time(.by = "month", .pad_value = 0) -> xdat
.date_var is missing. Using: dt
timetk::plot_seasonal_diagnostics(xdat, dt, n)
---
title: "CRAN History EDA"
author: "R Sangole"
output: html_notebook
---

```{r libraries, message=FALSE, warning=FALSE}
library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(lattice)
library(naniar)
library(skimr)
```

# Introduction



# Data Preparation {.tabset}

## Read In {.tabset}

```{r}
# Collect every CSV under ../input.
# NOTE(review): the two tables below are picked by position in `files`, so
# which file lands in which table depends on the order `fs::dir_ls()` returns
# them in -- confirm against the actual file names.
input_dir <- fs::path("../input")
files <- input_dir |> fs::dir_ls(glob = "*.csv")

# First CSV: one row per package/version/date/repository
# (presumably the release-history table -- TODO confirm).
his_dt <- files[1] |>
  read_csv(
    col_types = cols(
      package    = col_character(),
      version    = col_character(),
      repository = col_character(),
      date       = col_date(format = "%Y-%m-%d")
    )
  )

# Second CSV: one row per package with its DESCRIPTION-style metadata.
# Every column type is spelled out so nothing is left to readr's guessing.
ov_dt <- files[2] |>
  read_csv(
    col_types = cols(
      # identifiers
      package = col_character(),
      version = col_character(),
      # free-text metadata
      title       = col_character(),
      description = col_character(),
      author      = col_character(),
      license     = col_character(),
      # comma-separated package lists
      depends = col_character(),
      imports = col_character(),
      # links
      url         = col_character(),
      bug_reports = col_character(),
      # typed fields
      needs_compilation = col_logical(),
      date_published    = col_date(format = "%Y-%m-%d")
    )
  )
```

## Quick View {.tabset}

```{r}
# Print a transposed preview of every column, truncated to 100 characters.
ov_dt |> dplyr::glimpse(width = 100)
```



# Data Quality

```{r}
# Missingness map with rows sorted by publication date, so that any change in
# which fields are filled in over time shows up as a vertical pattern.
vis_miss(dplyr::arrange(ov_dt, date_published))
```

# Questions

* How long did packages take from their first release to version 1.0? 
* What type of packages were most frequent in different years?
* Who are the most productive authors? 
* Can you predict the growth toward 2025?
* What license is most used? Has there been a change over time?
* How many packages use all CAPS, all small, or a mixture?
* How have the dependencies & imports changed over time?
* Which repositories do packages use? Github/Bitbucket etc. How do these vary over time?
* Do packages have URLs for bug reports?
* Are there any temporal patterns in when versions are submitted to CRAN?
* Have titles & descriptions gotten longer over time?
* Do authors use minor versions?

## Features

Separate version

```{r}
# Split `version` on literal dots into (at most) three character columns.
#   * extra = "merge": anything after the second dot stays in `patch`
#     (e.g. "1.2.3.4" -> patch "3.4").
#   * fill = "right": versions with fewer than three pieces get NA in the
#     right-most columns (e.g. "0.1" -> patch NA).
#   * remove = FALSE: the original `version` column is kept.
# Only "." is a separator, so a hyphenated piece such as "2-3" is not split.
ov_dt <- tidyr::separate(
  data = ov_dt,
  col = version,
  into = c("major", "minor", "patch"),
  sep = "\\.",
  remove = FALSE,
  extra = "merge",
  fill = "right"
)
```


Number of dependencies

```{r}
# Number of comma-separated entries in `depends` (0 when the field is NA).
#
# The count is "commas + 1", which is exactly the number of pieces a split on
# "," would produce, but computed in one vectorised call instead of one
# `str_split()` per row. `if_else()` keeps the column integer; the previous
# `ifelse(..., 0, <int>)` silently promoted it to double.
#
# Note: an "R (>= x.y.z)" requirement is one entry like any other, so it is
# included in the count.
ov_dt <- ov_dt |>
  mutate(
    num_dep = dplyr::if_else(
      is.na(depends),
      0L,
      stringr::str_count(depends, ",") + 1L
    )
  )
```

Number of imports

```{r}
# Number of comma-separated entries in `imports` (0 when the field is NA).
#
# Counted as "commas + 1" in a single vectorised call rather than splitting
# each row separately. `if_else()` keeps the result integer, whereas
# `ifelse(..., 0, <int>)` would promote it to double.
ov_dt <- ov_dt |>
  mutate(
    num_imports = dplyr::if_else(
      is.na(imports),
      0L,
      stringr::str_count(imports, ",") + 1L
    )
  )
```

Number of authors

```{r}
# Approximate number of authors per package.
#
# Author strings carry role lists such as "Jane Doe [aut, cre]"; counting raw
# commas would treat that single person as two authors. Bracketed role lists
# and parenthesised remarks are therefore removed before the commas are
# counted ("commas + 1" = number of comma-separated names).
#
# A missing `author` yields NA rather than being counted as one author.
#
# Known limitations: names joined only by "and" count as one entry, and
# nested parentheses are only stripped up to the first closing parenthesis.
ov_dt <- ov_dt |>
  mutate(
    num_authors = stringr::str_count(
      stringr::str_remove_all(author, "\\[[^\\]]*\\]|\\([^)]*\\)"),
      ","
    ) + 1L
  )
```

Temporal features
```{r}
# Calendar features derived from `date_published`.
#   year / day : numeric components of the date
#   month/wday : labelled (ordered-factor) month and weekday
#   yr_mon     : "<year>-<month label>" text key, NA when the date is missing
#   dt         : first day of the publication month (Date), used as the
#                monthly time axis in the plots further down
ov_dt <- ov_dt |>
  mutate(
    year = lubridate::year(date_published),
    month = lubridate::month(date_published, label = TRUE),
    day = lubridate::day(date_published),
    wday = lubridate::wday(date_published, label = TRUE),
    # Without the guard a missing date would become the literal string "NA-NA".
    yr_mon = dplyr::if_else(
      is.na(date_published),
      NA_character_,
      sprintf("%d-%s", year, month)
    ),
    # Truncate the date directly instead of pasting the month label and
    # re-parsing it: no dependence on the locale's month names, and a missing
    # date simply stays NA instead of raising a "failed to parse" warning.
    dt = lubridate::floor_date(date_published, unit = "month")
  )
```


Title & Description Lengths

```{r}
# Word counts of the title and description: the number of `\w+` runs in each
# string. `str_count()` is already vectorised over its input, so it is applied
# to the whole column; missing text stays NA.
ov_dt <- mutate(
  ov_dt,
  len_title = stringr::str_count(title, pattern = "\\w+"),
  len_desc = stringr::str_count(description, pattern = "\\w+")
)
```


License 

```{r}
# Collapse the free-text `license` field into a small set of license families.
#
# `case_when()` tries the rules from top to bottom and uses the first one that
# is TRUE, so the ORDER below is part of the logic: a string mentioning
# several licenses is labelled by whichever rule matches first. A condition
# that evaluates to NA (i.e. a missing `license`) never matches, so those rows
# fall through to "Other".
ov_dt <- ov_dt |> 
  mutate(
    license_cleaned = case_when(
      # GPL with an explicit version: either the "GPL-3" spelling or the
      # "GPL (<comparison> 3...)" spelling, e.g. "GPL (>= 3)".
      stringr::str_detect(license, "^GPL-3") ~ "GPL-3",
      stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*3") ~ "GPL-3",
      # Same two spellings for version 2, e.g. "GPL-2" or "GPL (>= 2)".
      stringr::str_detect(license, "^GPL-2") ~ "GPL-2",
      stringr::str_detect(license, "^GPL\\s\\([\\s\\d\\.<=>]*2") ~ "GPL-2",
      # Anchored at the start of the string.
      stringr::str_detect(license, "^AGPL") ~ "AGPL",
      stringr::str_detect(license, "^LGPL") ~ "LGPL",
      # Unanchored: match anywhere in the string.
      stringr::str_detect(license, "Apache") ~ "Apache",
      stringr::str_detect(license, "BSD") ~ "BSD",
      # LGPL mentioned anywhere other than the start (the anchored rule above
      # already took the rest); only reached if Apache/BSD did not match.
      stringr::str_detect(license, "LGPL") ~ "LGPL",
      # stringr::str_detect(license, "GNU") ~ "GNU",
      # Also covers strings such as "MIT + file LICENSE".
      stringr::str_detect(license, "MIT") ~ "MIT",
      stringr::str_detect(license, "CC0") ~ "CC0",
      # stringr::str_detect(license, "MPL") ~ "MPL",
      # stringr::str_detect(license, "Unlimited") ~ "Unlimited",
      # stringr::str_detect(license, "^CC") ~ "CC",
      # Bare "GPL" with no version information at all (exact match).
      license == "GPL" ~ "GPL",
      # Everything else, including missing licenses.
      TRUE ~ "Other"
      )
  )
```

Bug Report Domain

```{r}
# Host name of each package's bug-report URL.
# Packages without a bug-report URL get the empty string (not NA), which the
# repository plot later uses to filter them out.
ov_dt <- ov_dt |>
  mutate(
    domain = purrr::map_chr(
      bug_reports,
      \(link) if (is.na(link)) "" else urltools::url_parse(link)$domain
    )
  )
```


# Temporal Questions

* How long did packages take from their first release to version 1.0? 
* What type of packages were most frequent in different years?
* Who are the most productive authors? 
* Can you predict the growth toward 2025?
* What license is most used? Has there been a change over time? - done
* How many packages use all CAPS, all small, or a mixture?
* How have the dependencies & imports changed over time?
* Which repositories do packages use? Github/Bitbucket etc. How do these vary over time?
* Do packages have URLs for bug reports?
* Are there any temporal patterns in when versions are submitted to CRAN?
* Do authors use minor versions?
* Have titles & descriptions gotten longer over time? - done


* How have the dependencies & imports changed over time?

```{r}
# Monthly mean number of `depends` and `imports` entries per package.
# `across()` with a named function list produces `num_dep_mean` and
# `num_imports_mean`; each series is drawn as jittered points plus a smoother.
ov_dt |>
  group_by(dt) |>
  summarise(across(c(num_dep, num_imports), list(mean = mean))) |>
  ggplot(mapping = aes(x = dt)) +
  # depends
  geom_jitter(
    mapping = aes(y = num_dep_mean, color = "num_dep_mean"),
    alpha = 0.2
  ) +
  geom_smooth(
    mapping = aes(y = num_dep_mean, color = "num_dep_mean"),
    se = FALSE,
    span = 0.3
  ) +
  # imports
  geom_jitter(
    mapping = aes(y = num_imports_mean, color = "num_imports_mean"),
    alpha = 0.2
  ) +
  geom_smooth(
    mapping = aes(y = num_imports_mean, color = "num_imports_mean"),
    se = FALSE,
    span = 0.3
  ) +
  theme_light()
```

* Have titles & descriptions gotten longer over time? 

```{r}
# Monthly median (and sd) of title and description word counts, ignoring
# missing values. Only the medians are plotted; the `*_sd` columns are
# computed but not drawn.
ov_dt |>
  group_by(dt) |>
  summarise(
    across(
      c(len_title, len_desc),
      list(
        median = \(v) median(v, na.rm = TRUE),
        sd = \(v) sd(v, na.rm = TRUE)
      )
    )
  ) |>
  ggplot(mapping = aes(x = dt)) +
  # titles
  geom_jitter(
    mapping = aes(y = len_title_median, color = "len_title_median"),
    alpha = 0.2
  ) +
  geom_smooth(
    mapping = aes(y = len_title_median, color = "len_title_median"),
    se = FALSE,
    span = 0.3
  ) +
  # descriptions
  geom_jitter(
    mapping = aes(y = len_desc_median, color = "len_desc_median"),
    alpha = 0.2
  ) +
  geom_smooth(
    mapping = aes(y = len_desc_median, color = "len_desc_median"),
    se = FALSE,
    span = 0.3
  ) +
  theme_light()
```

```{r}
# Distribution of description word counts for three sample years:
# overlaid densities first, then one histogram panel per year.
# The aesthetics are declared once on `ggplot()` and inherited by the layer.
ov_dt |>
  filter(year %in% c(2022, 2020, 2018)) |>
  ggplot(
    aes(
      x = len_desc,
      fill = as.factor(year),
      color = as.factor(year)
    )
  ) +
  geom_density(alpha = 0.3)

ov_dt |>
  filter(year %in% c(2022, 2020, 2018)) |>
  ggplot(
    aes(
      x = len_desc,
      fill = as.factor(year),
      color = as.factor(year)
    )
  ) +
  geom_histogram(alpha = 0.3) +
  facet_wrap(~year)
```



```{r}
# Per-package word counts against publication date on a log10 y axis:
# titles first, then descriptions.
# NOTE(review): with this many rows `geom_smooth()` may pick a smoothing
# method other than loess, in which case `span` has no effect -- confirm
# against the ggplot2 documentation for the default `method`.
ggplot(ov_dt, aes(x = date_published, y = len_title)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth(se = FALSE, span = 0.1) +
  scale_y_log10() +
  theme_light()

ggplot(ov_dt, aes(x = date_published, y = len_desc)) +
  geom_jitter(alpha = 0.05) +
  geom_smooth(se = FALSE, span = 0.2) +
  scale_y_log10() +
  theme_light()
```

```{r}

```


* What license is most used? Has there been a change over time?

```{r}
# Number of packages per license family, as horizontal bars ordered by count.
ov_dt |>
  count(license_cleaned) |>
  ggplot(
    aes(
      x = forcats::fct_reorder(license_cleaned, n),
      y = n,
      fill = license_cleaned
    )
  ) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  # The fill legend repeats the axis labels, so it is switched off.
  # `"none"` replaces the deprecated `guides(fill = FALSE)` form.
  guides(fill = "none") +
  labs(x = "", y = "")
```

```{r}
# Packages published per month for each license family, first on a linear
# and then on a log10 y axis.
#
# `count(dt, license_cleaned)` returns an ungrouped tibble. With
# `group_by(dt) |> count(license_cleaned)` the result stays grouped by month,
# so the following `mutate()` ran `fct_reorder()` separately inside every
# month and the resulting level order did not reflect the data as a whole.
# Ungrouped, the levels are ordered by each family's monthly counts across
# all months (`fct_reorder()`'s default summary).
ov_dt |>
  count(dt, license_cleaned) |>
  mutate(license_cleaned = forcats::fct_reorder(license_cleaned, n)) |>
  ggplot(aes(x = dt, y = n, color = license_cleaned)) +
  # geom_line( alpha = 0.3) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

ov_dt |>
  count(dt, license_cleaned) |>
  mutate(license_cleaned = forcats::fct_reorder(license_cleaned, n)) |>
  ggplot(aes(x = dt, y = n, color = license_cleaned)) +
  # geom_line( alpha = 0.3) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.8, se = FALSE) +
  theme_light() +
  scale_y_log10()
```

* Do packages have URLs for bug reports?


```{r}
# Packages per month, split by whether a URL is present.
# `url_exist` is TRUE when the field is filled in. The flag was previously
# computed as `is.na(...)`, which labelled packages WITHOUT a URL as TRUE and
# so swapped the two series in the legend.

# Project / home-page URL.
ov_dt |>
  count(dt, url_exist = !is.na(url)) |>
  ggplot(aes(x = dt, y = n, color = url_exist)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()

# Bug-report URL.
ov_dt |>
  count(dt, url_exist = !is.na(bug_reports)) |>
  ggplot(aes(x = dt, y = n, color = url_exist)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(span = 0.3, se = FALSE) +
  theme_light()
```

* Which repositories do packages use? Github/Bitbucket etc. How do these vary over time?

```{r}
# Packages per month by bug-report host.
# Rows with an empty domain (no bug-report URL) are dropped, and hosts that
# appear fewer than 20 times overall are lumped into a single "Other" level
# before counting.
ov_dt |>
  filter(domain != "") |>
  mutate(domain = forcats::fct_lump_min(domain, min = 20)) |>
  count(dt, domain) |>
  ggplot(mapping = aes(x = dt, y = n, color = domain)) +
  geom_jitter(alpha = 0.3) +
  geom_smooth(se = FALSE, span = 0.5) +
  theme_light()
```

* Are there any temporal patterns in when versions are submitted to CRAN?

```{r}
# Packages published per month as a simple line. Rows without a month are
# dropped first so the NA group is not handed to the plot.
ov_dt |>
  filter(!is.na(dt)) |>
  count(dt) |>
  ggplot(aes(dt, n)) +
  geom_line()

# Regular monthly series for the seasonal diagnostics: months with no
# publications are inserted with a count of 0. The date column is named
# explicitly so `pad_by_time()` does not have to guess it (it otherwise prints
# ".date_var is missing. Using: dt").
xdat <- ov_dt |>
  filter(!is.na(dt)) |>
  count(dt) |>
  arrange(dt) |>
  timetk::pad_by_time(.date_var = dt, .by = "month", .pad_value = 0)

timetk::plot_seasonal_diagnostics(xdat, dt, n)
```

